ftp.cs.arizona.edu

home *** CD-ROM | disk | FTP | other *** search

/ ftp.cs.arizona.edu / ftp.cs.arizona.edu.tar / ftp.cs.arizona.edu / icon / newsgrp / group96b.txt / 000042_icon-group-sender _Fri Oct 25 11:00:09 1996.msg < prev next >

Wrap

Internet Message Format | 1997-01-02 | 9KB

Received: by cheltenham.cs.arizona.edu; Fri, 25 Oct 1996 12:58:04 MST Date: Fri, 25 Oct 1996 11:00:09 -0700 From: Gregg Townsend <gmt> Message-Id: <9610251800.AA12975@hawk.CS.Arizona.EDU> To: nr@viper.cs.Virginia.EDU Subject: Re: Icon code for working with HTML Cc: icon-group Errors-To: icon-group-errors@cs.arizona.edu Here's some code for parsing HTML files. It isn't exactly what you're looking for, but it may help. Gregg Townsend / gmt@CS.Arizona.EDU / +1 520 621 4325 / 32 13 45N 110 57 16W Computer Science / Univ of Arizona / 1040 E 4th St / Tucson AZ 85721-0077 ############################################################################ # # File: html.icn # # Subject: Procedures for parsing HTML # # Author: Gregg M. Townsend # # Date: August 12, 1996 # ############################################################################ # # These procedures parse HTML files: # # htchunks(f) generates the basic chunks -- tags and text -- # that compose an HTML file. # # htrefs(f) generates the tagname/keyword/value combinations # that reference other files. # # These procedures process strings from HTML files: # # httag(s) extracts the name of a tag. # # htvals(s) generates the keyword/value pairs from a tag. # # urlmerge(base,new) interprets a new URL in the context of a base. # ############################################################################ # # htchunks(f) generates the HTML chunks from file f. 
#  It returns strings beginning with
#
#  NOTE(review): this copy of the file is damaged at this point.  The rest
#  of the header comment above, the definition of htchunks() (which
#  htrefs() below calls), and the opening of a comment-skipping helper are
#  absent -- apparently dropped when the message was converted for
#  display.  Restore them from a pristine copy of html.icn; nothing has
#  been reconstructed here.  The surviving tail of that helper is kept
#  below, token for token, but commented out because it cannot be
#  translated without its missing beginning:
#
#        ') + 3) then fail          # normal case: discard comment
#        s ||:= tab(0)
#        &subject := (read(f) || "\n") | break
#        }
#     &subject := s                 # rescan unclosed comment
#     return "<!--"                 # return error indicator
#  end


##  htc_text(f) -- collect text up to the next "<"
#
#  Works on the current scanning subject (&subject / &pos), so it must be
#  called from inside a string-scanning expression.  Text is accumulated
#  until a "<" is seen; whenever the subject is exhausted first, the next
#  line of f (with "\n" appended) becomes the new subject.  At end of file
#  whatever has been accumulated so far is returned.

procedure htc_text(f)
   local s

   s := ""
   repeat {
      if s ||:= tab(upto('<')) then
         return s                       # stopped just before a "<"
      s ||:= tab(0)                     # no "<" here: take the rest
      &subject := (read(f) || "\n") | return s   # refill, or finish at EOF
      }
end


##  htrefs(f) -- generate references from HTML file f
#
#  For every chunk produced by htchunks(f) that httag() recognizes as a
#  tag, each keyword/value string from htvals() is checked against the
#  set of keywords registered for that tag name.  Every match is
#  generated as   tagname || " " || keyword || " " || value.
#  Keywords that have no value never match, because the keyword is taken
#  to be the text before the first blank.

procedure htrefs(f)                     #: generate references from HTML file
   local tag, tagname, kwset, s
   static ttable

   initial {
      # tag name -> keywords whose values name other files
      ttable := table()
      ttable["A"]     := set(["HREF"])
      ttable["AREA"]  := set(["HREF"])  # key must be the full tag name AREA
      ttable["BASE"]  := set(["HREF"])
      ttable["BODY"]  := set(["BACKGROUND"])
      ttable["FORM"]  := set(["ACTION"])
      ttable["IMG"]   := set(["SRC", "LOSRC", "USEMAP"])
      ttable["INPUT"] := set(["SRC"])
      ttable["LINK"]  := set(["HREF"])
      }

   every tag := htchunks(f) do {
      tagname := httag(tag) | next      # not a tag (text, or no tag name)
      kwset := \ttable[tagname] | next  # tag has no file-valued keywords
      every s := htvals(tag) do
         if member(kwset, s ? tab(upto(' '))) then
            suspend tagname || " " || s
      }
end


##  httag(s) -- return the name of the HTML tag s
#
#  s must begin with "<".  Whitespace after the "<" is skipped and the
#  following run of letters, digits, "." and "-" is returned mapped to
#  upper case.  Fails if s does not start with "<" or if no such run
#  follows.

procedure httag(s)                      #: extract name of HTML tag
   static idset, wset, lcase, ucase

   initial {
      idset := &letters ++ &digits ++ '.-'
      wset := ' \t\r\n\v\f'
      lcase := string(&lcase)
      ucase := string(&ucase)
      }

   s ? {
      ="<" | fail
      tab(many(wset))
      return map(tab(many(idset)), lcase, ucase)
      }
end


##  htvals(s) -- generate tag values of HTML tag s
#
#  s must begin with "<" followed (after optional whitespace) by a tag
#  name; otherwise htvals fails.  Each keyword after the name is
#  generated, mapped to upper case.  When the keyword is followed by "=",
#  a blank and the value are appended: "KEYWORD value".  A value may be
#  enclosed in double or single quotes (the quotes are removed; an
#  unterminated quote runs to the end of s), or be unquoted, in which
#  case it ends at whitespace or ">".

procedure htvals(s)                     #: generate values in HTML tag
   local kw
   static idset, wset, qset, lcase, ucase

   initial {
      idset := &letters ++ &digits ++ '.-'
      wset := ' \t\r\n\v\f'
      qset := wset ++ '>'               # terminators of an unquoted value
      lcase := string(&lcase)
      ucase := string(&ucase)
      }

   s ? {
      ="<" | fail
      tab(many(wset))
      tab(many(idset)) | fail           # no name
      repeat {
         tab(upto(idset)) | fail        # no further keywords
         kw := map(tab(many(idset)), lcase, ucase)
         tab(many(wset))
         if ="=" then {
            tab(many(wset))
            kw ||:= " "
            if ="\"" then {
               kw ||:= tab(upto('"') | 0)
               tab(any('"'))            # step over the closing quote
               }
            else if ="'" then {
               kw ||:= tab(upto('\'') | 0)
               tab(any('\''))           # step over the closing quote
               }
            else
               kw ||:= tab(upto(qset) | 0)
            }
         suspend kw
         }
      }
end


#  urlmerge(base,new) -- merge URLs
#
#  Interprets new in the context of base and returns the combined URL:
#
#     new starts with letters and ":"   new is returned unchanged
#     new starts with "#"               base (up to any "#") || new
#     new starts with "/"               protocol and host of base || new
#     otherwise                         leading "./" and "../" components
#                                       of new are consumed, each "../"
#                                       removing one directory from the
#                                       base path via url_trim(); the
#                                       remainder of new is appended to
#                                       the base path cut back to its
#                                       last "/"
#
#  A "../" that url_trim() cannot remove is left in the result.

procedure urlmerge(base, new)           #: merge URLs
   local protocol, host, path
   static notslash

   initial notslash := ~'/'

   if new ? (tab(many(&letters)) & =":") then
      return new                        # new is fully specified

   # split base into protocol, host and path (fragment dropped)
   base ? {
      protocol := (tab(many(&letters)) || =":") | ""
      host := (="//" || tab(upto('/') | 0)) | ""
      path := tab(upto('#') | 0)
      }

   new ? {
      if ="#" then
         return protocol || host || path || new
      if ="/" then
         return protocol || host || new
      # consume "./" and "../" prefixes; stops at the first component
      # that is neither, or when url_trim() fails
      while (="." & (="/" | pos(0))) |
         (=".." & (="/" | pos(0)) & (path := url_trim(path)))
      return protocol || host || trim(path, notslash) || tab(0)
      }
end


#  url_trim(path) -- trim trailing dir provided that at least one "/" remains
#
#  Removes everything from the last "/" of path onward.  Fails if path
#  contains no "/", if the component now at the end is "..", or if the
#  shortened path would contain no "/" at all.

procedure url_trim(path)

   reverse(path) ? {                    # work from back end
      tab(upto('/') + 1) | fail         # trim dir, fail if no "/"
      if =".." & (="/" | pos(0)) then
         fail                           # don't trim a ".." component
      path := reverse(tab(0))           # otherwise use the rest
      }

   if upto('/', path) then              # one / must remain to be valid
      return path
   else
      fail
end